Assignment 1. Perception Visualization

# Load the olive data; read.csv() already defaults to sep = "," and
# header = TRUE, so only the file name is needed.
olive <- read.csv(file = "olive.csv")
library(ggplot2)
library(plotly)
library(MASS)
library(gridExtra)
# Palmitic against oleic, with the raw linolenic values mapped to colour.
p_vs_o <- ggplot(
  data = olive,
  mapping = aes(x = oleic, y = palmitic, colour = linolenic)
)
p_vs_o <- p_vs_o + geom_point()
p_vs_o

# Same scatter plot, but linolenic is first binned into four intervals
# with cut_interval() so that colour is mapped to a discrete variable.
olive[["linolenic2"]] <- cut_interval(olive[["linolenic"]], n = 4)
p_vs_o2 <- ggplot(
  data = olive,
  mapping = aes(x = oleic, y = palmitic, colour = linolenic2)
)
p_vs_o2 <- p_vs_o2 + geom_point()

p_vs_o2

# Three versions of the oleic-vs-palmitic scatter plot. Each one encodes the
# binned linolenic2 variable through a different visual channel.

# Channel 1: colour.
pal_vs_ole_col <- ggplot(data = olive, mapping = aes(x = palmitic, y = oleic)) +
  geom_point(mapping = aes(colour = linolenic2))

pal_vs_ole_col

# Channel 2: point size.
pal_vs_ole_size <- ggplot(data = olive, mapping = aes(x = palmitic, y = oleic)) +
  geom_point(mapping = aes(size = linolenic2))

pal_vs_ole_size

# Channel 3: orientation. The integer code of each linolenic2 level is used
# as the spoke angle; every spoke has the same radius.
pal_vs_ole_angle <- ggplot(data = olive, mapping = aes(x = palmitic, y = oleic)) +
  geom_point() +
  geom_spoke(mapping = aes(angle = as.numeric(linolenic2), radius = 20))

pal_vs_ole_angle

# Eicosenoic against oleic with Region mapped to colour exactly as stored
# in the data (NOTE: if Region is stored as a number, ggplot2 will pick a
# continuous colour scale here).
o_vs_e <- ggplot(
  data = olive,
  mapping = aes(x = oleic, y = eicosenoic, colour = Region)
)
o_vs_e <- o_vs_e + geom_point()
o_vs_e

# Same plot with Region converted to a factor, which gives one distinct
# colour per region.
o_vs_e0 <- ggplot(
  data = olive,
  mapping = aes(x = oleic, y = eicosenoic, colour = as.factor(Region))
)
o_vs_e0 <- o_vs_e0 + geom_point()

o_vs_e0

# Bin three further variables into three intervals each, so that they can be
# mapped to discrete aesthetics.
olive[["linoleic2"]] <- cut_interval(olive[["linoleic"]], n = 3)
olive[["palmitic2"]] <- cut_interval(olive[["palmitic"]], n = 3)
olive[["palmitoleic2"]] <- cut_interval(olive[["palmitoleic"]], n = 3)

# Three binned variables shown at once through colour, size and shape.
o_vs_e1 <- ggplot(data = olive, mapping = aes(x = oleic, y = eicosenoic)) +
  geom_point(
    mapping = aes(colour = linoleic2, size = palmitoleic2, shape = palmitic2)
  )
o_vs_e1

# Same encoding, except that colour now carries Region instead of linoleic2.
o_vs_e2 <- ggplot(data = olive, mapping = aes(x = oleic, y = eicosenoic)) +
  geom_point(
    mapping = aes(colour = Region, size = palmitoleic2, shape = palmitic2)
  )

o_vs_e2

# Pie chart of the Area column. Only `labels` is supplied (no `values`), so
# each slice is sized by how often its label occurs in the data.
# The axis option is spelled `showticklabels`; the previous spelling
# (`showticklables`) is not a valid plotly attribute and had no effect.
plot_ly(data = olive, labels = ~Area, type = "pie", showlegend = FALSE) %>%
  layout(
    title = "Proportion of Oils from different regions",
    xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
    yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE)
  )
# Eicosenoic against linoleic with 2-D density contours drawn on top of the
# points.
l_vs_e <- ggplot(data = olive, mapping = aes(x = linoleic, y = eicosenoic))
l_vs_e <- l_vs_e +
  geom_point() +
  geom_density_2d()
# Filled-polygon alternative, left disabled:
#   + stat_density_2d(aes(fill = stat(level)), geom = "polygon")
l_vs_e

Assignment 2: Multidimensional Scaling of a high-dimensional object

# Load the baseball data; sep = "," and header = TRUE are the read.csv()
# defaults, so only the file name is needed.
baseball <- read.csv(file = "baseball-2016.csv")

Question 1

The xlsx file was converted to a csv file, which makes it easier to load the data without extra packages. It is reasonable to scale the data because the variables have very different ranges; scaling ensures that features with large scales do not dominate. For example, compare the difference between the maximum and minimum values of “AB” with that of “Home runs per game”: the two are on totally different scales.

# Spread (maximum minus minimum) of AB
range1 <- diff(range(baseball$AB))
range1
## [1] 340
# Spread (maximum minus minimum) of HR per game
range2 <- diff(range(baseball$HR.per.game))

range2
## [1] 0.8039644

The density plot below shows graphically why scaling is needed. An appropriate density should be similar to a log-normal density.

# Kernel density of all values from column 3 onward, pooled into one matrix.
# The argument expression is kept inline because density() records its call
# and plot() uses that call as the default title.
numeric_density <- density(as.matrix(baseball[3:ncol(baseball)]))
plot(numeric_density)

Question 2

# Standardise every column from the third one onward (the numeric variables)
baseball2 <- scale(x = baseball[, 3:ncol(baseball)])
# Pairwise Minkowski distances with p = 2 (i.e. Euclidean distance)
mink_d <- dist(x = baseball2, method = "minkowski", p = 2)
# Fit a two-dimensional MDS configuration with MASS::isoMDS()
resid <- isoMDS(d = mink_d, k = 2)
## initial  value 19.856833 
## iter   5 value 16.319153
## iter  10 value 16.046215
## final  value 15.935476 
## converged
# Fitted configuration: one row per observation, one column per MDS dimension
coords <- resid$points

# Data-frame version of the coordinates (its columns are referred to below
# as V1 and V2)
coordsMDS <- as.data.frame(coords)

# Prepend the first two columns of baseball, used below as Team and League
coordsMDS2 <- cbind(baseball[, 1:2], coordsMDS)
# Scatter plot of the two MDS dimensions, coloured by League, with the team
# name shown on hover
plot_ly(
  data = coordsMDS2,
  x = ~V1,
  y = ~V2,
  type = "scatter",
  hovertext = ~Team,
  color = ~League,
  colors = c("#0444BF", "#F46A4E")
)

There appears to be a difference between the two leagues: most of the NL teams are in the third and fourth quadrants (negative V2), while most of the AL teams are in the first and second quadrants (positive V2). The y-axis “V2” gives the best differentiation between the leagues. The Boston Red Sox, Atlanta Braves and Philadelphia Phillies appear to be outliers.

# A Shepard diagram is used to assess the goodness of fit of an MDS solution.
# Shepard() is run on the isoMDS configuration together with the same
# dissimilarities that were used to fit it.
shep <- Shepard(d = mink_d, x = coords)
# Original dissimilarities as a plain numeric vector
delta <- as.numeric(mink_d)
# Pairwise distances between the fitted MDS points, as a plain numeric vector
D <- as.numeric(dist(coords))
# Build, for every pair stored in a dist object, the row number (index1) and
# the column number (index2) of that pair, so that each distance can be
# labelled with its two objects. The lower triangle is extracted in the same
# column-wise order in which dist() stores its values.

# Number of objects in the MDS configuration (computed once)
n <- nrow(coords)

# n x n matrix filled column-wise: every entry holds its own row number
index <- matrix(seq_len(n), nrow = n, ncol = n)

# Row numbers of the lower-triangle entries
index1 <- as.numeric(index[lower.tri(index)])

# Same procedure, filled row-wise: every entry holds its own column number
index <- matrix(seq_len(n), nrow = n, ncol = n, byrow = TRUE)

# Column numbers of the lower-triangle entries
index2 <- as.numeric(index[lower.tri(index)])
# Work on a copy so that the original data frame is left untouched
baseball3 <- baseball

# Use the first column as row names; they serve as hover labels below
rownames(baseball3) <- baseball3[, 1]
# Shepard plot: original dissimilarities (x) against MDS distances (y);
# hovering over a point shows the two objects that form the pair
plot_ly() %>%
  add_markers(
    x = ~delta,
    y = ~D,
    hoverinfo = "text",
    text = ~paste(
      "Obj1: ", rownames(baseball3)[index1],
      "<br> Obj2: ", rownames(baseball3)[index2]
    )
  ) %>%
  # fitted line from Shepard(), relevant when non-metric MDS is involved
  add_lines(x = ~shep$x, y = ~shep$yf)
# Combine the first two columns of baseball (used below as Team and League),
# the second MDS coordinate (named V2 directly) and the scaled variables.
d4 <- cbind(baseball[, 1:2], V2 = coords[, 2], baseball2)

# Plot V2 against one other variable, coloured by League, with reference
# lines through zero on both axes.
#
# df:       data frame containing the columns V2, League and `y_string`
# y_string: name (character string) of the column to put on the y axis
#
# Returns the ggplot object.
#
# aes_string() is deprecated in ggplot2, so the column is looked up by name
# through the .data pronoun; labs() keeps the column name as the axis title.
myplot <- function(df, y_string) {
  ggplot(df, aes(x = V2, y = .data[[y_string]], color = League)) +
    geom_point() +
    geom_vline(xintercept = 0, color = "black") +
    geom_hline(yintercept = 0, color = "black") +
    labs(y = y_string)
}
# Variables that seem to have the strongest relationship with V2
# HR per game:
myplot(d4, "HR.per.game")

# X3B:
myplot(d4, "X3B")

Per the plots, HR per game and HR have the same scatter points. The two have the strongest positive connection to V2, while X3B has the strongest negative connection to V2.